{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import *\n",
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "from sklearn.metrics import accuracy_score\n",
    "import time\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from gensim.models import FastText, Word2Vec\n",
    "import re\n",
    "import random as rn\n",
    "import gc\n",
    "import logging\n",
    "# Seed the stdlib and NumPy random generators with the same fixed value.\n",
    "# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it\n",
    "# here presumably only reaches child processes, not this kernel -- confirm.\n",
    "os.environ['PYTHONHASHSEED'] = '0'\n",
    "rn.seed(1017)\n",
    "np.random.seed(1017)\n",
    "\n",
    "# Register tqdm's pandas integration so `.progress_apply` is available.\n",
    "tqdm.pandas()\n",
    "\n",
    "# Directory holding the input CSV files; the cell ends by listing its contents.\n",
    "path = \"data/\"\n",
    "os.listdir(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw data (loading is slow and needs to be sped up)\n",
    "def get_age_data():\n",
    "    \"\"\"Load the age train and test CSVs from `path` and stack them into one frame.\n",
    "\n",
    "    Both files are read without a header row and concatenated row-wise\n",
    "    (train rows first). Any cell that is missing after the concatenation is\n",
    "    filled with -1.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame whose two columns are named 'uId' and 'age_group'.\n",
    "    \"\"\"\n",
    "    frames = [\n",
    "        pd.read_csv(path + file_name, header=None)\n",
    "        for file_name in ('age_train.csv', 'age_test.csv')\n",
    "    ]\n",
    "    combined = pd.concat(frames, axis=0, sort=False).fillna(-1)\n",
    "    combined.columns = ['uId', 'age_group']\n",
    "    return combined\n",
    "\n",
    "def get_user_app_actived():\n",
    "    \"\"\"Load user_app_actived.csv from `path` (read without a header row).\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame whose two columns are named 'uId' and 'appId'.\n",
    "    \"\"\"\n",
    "    column_names = ['uId', 'appId']\n",
    "    activated = pd.read_csv(path + 'user_app_actived.csv', header=None)\n",
    "    activated.columns = column_names\n",
    "    return activated\n",
    "\n",
    "def get_user_behavior_info():\n",
    "    \"\"\"Load user_behavior_info.csv from `path` (read without a header row).\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with nine columns: 'uId', 'bootTimes', the six\n",
    "        'AFuncTimes' .. 'FFuncTimes' columns, and 'FFuncSum'.\n",
    "    \"\"\"\n",
    "    column_names = [\n",
    "        'uId', 'bootTimes',\n",
    "        'AFuncTimes', 'BFuncTimes', 'CFuncTimes',\n",
    "        'DFuncTimes', 'EFuncTimes', 'FFuncTimes',\n",
    "        'FFuncSum',\n",
    "    ]\n",
    "    behavior = pd.read_csv(path + 'user_behavior_info.csv', header=None)\n",
    "    behavior.columns = column_names\n",
    "    return behavior\n",
    "\n",
    "def get_user_basic_info():\n",
    "    \"\"\"Load user_basic_info.csv from `path` (read without a header row).\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with thirteen columns, named as listed in\n",
    "        `column_names` below ('uId' first, 'os' last).\n",
    "    \"\"\"\n",
    "    column_names = [\n",
    "        'uId', 'gender', 'city', 'prodName',\n",
    "        'ramCapacity', 'ramLeftRation',\n",
    "        'romCapacity', 'romLeftRation',\n",
    "        'color', 'fontSize', 'ct', 'carrier', 'os',\n",
    "    ]\n",
    "    basic_info = pd.read_csv(path + 'user_basic_info.csv', header=None)\n",
    "    basic_info.columns = column_names\n",
    "    return basic_info\n",
    "\n",
    "def get_app_info():\n",
    "    \"\"\"Load app_info.csv from `path` (read without a header row).\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame whose two columns are named 'appId' and 'category'.\n",
    "    \"\"\"\n",
    "    column_names = ['appId', 'category']\n",
    "    app_info = pd.read_csv(path + 'app_info.csv', header=None)\n",
    "    app_info.columns = column_names\n",
    "    return app_info\n",
    "\n",
    "def get_user_app_usage(less_data=False):\n",
    "    \"\"\"Load user_app_usage.csv from `path`.\n",
    "\n",
    "    Args:\n",
    "        less_data: True reads only the first 2,000,000 rows (for quick tests);\n",
    "            False (the default) reads the whole file (for feature extraction).\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns\n",
    "        ['uId', 'appId', 'duration', 'times', 'use_date'].\n",
    "    \"\"\"\n",
    "    # header=None is passed in both modes, matching the full-file read: without it\n",
    "    # pandas would consume the first record as column names, and that record\n",
    "    # would be lost once the columns are overwritten below.\n",
    "    row_limit = 2000000 if less_data else None\n",
    "    data = pd.read_csv(path + 'user_app_usage.csv', header=None, nrows=row_limit)\n",
    "    data.columns = ['uId', 'appId', 'duration', 'times', 'use_date']\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'get_user_app_usage' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-64cc81a1d7c4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_user_app_usage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mless_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'uId'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'app'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'peroid'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'times'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'start'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'peroid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'start'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'times'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'get_user_app_usage' is not defined"
     ]
    }
   ],
   "source": [
    "# Usage log: drop the duration column, switch to the short column names used\n",
    "# below, and order rows by usage count (descending) with ties broken by date.\n",
    "# A single multi-key sort is used because a sort on 'start' followed by a\n",
    "# separate default sort on 'times' would discard the first ordering (the default\n",
    "# single-column sort is not stable) while still paying for two full sorts.\n",
    "packtime_all = (\n",
    "    get_user_app_usage(less_data=False)\n",
    "    .drop(columns='duration')\n",
    "    .rename(columns={'appId': 'app', 'use_date': 'start'})\n",
    "    .sort_values(['times', 'start'], ascending=[False, True])\n",
    ")\n",
    "\n",
    "# Only the user ids are needed here: the second column of the train file is\n",
    "# dropped, and both frames end up with a single 'uId' column.\n",
    "train_data = pd.read_csv(path + 'age_train.csv', header=None)\n",
    "test_data = pd.read_csv(path + 'age_test.csv', header=None)\n",
    "del train_data[1]\n",
    "train_data.columns = ['uId']\n",
    "test_data.columns = ['uId']\n",
    "\n",
    "# Process the users in five batches: four positional slices of the train ids\n",
    "# (500,000 rows each, the last one taking the remainder) plus all test ids.\n",
    "d1 = train_data.iloc[:500000]\n",
    "d2 = train_data.iloc[500000:1000000]\n",
    "d3 = train_data.iloc[1000000:1500000]\n",
    "d4 = train_data.iloc[1500000:]\n",
    "d5 = test_data\n",
    "\n",
    "def set_first_times(row):\n",
    "    \"\"\"Join the values under `row['app']` into one space-separated string.\n",
    "\n",
    "    Args:\n",
    "        row: any mapping-like object whose 'app' entry is an iterable of\n",
    "            strings. Despite the name, the callers in this notebook pass a\n",
    "            whole groupby group rather than a single row.\n",
    "\n",
    "    Returns:\n",
    "        str: the 'app' values in iteration order, separated by single spaces.\n",
    "    \"\"\"\n",
    "    apps = row['app']\n",
    "    return ' '.join(apps)\n",
    "\n",
    "# For each batch of users, build one row per user holding that user's apps as a\n",
    "# single space-separated string: apps are first joined per (uId, start) group,\n",
    "# then those per-date strings are joined per uId.\n",
    "df_value = []\n",
    "for users_chunk in tqdm([d1, d2, d3, d4, d5]):\n",
    "    # Left merge keeps users with no usage rows; their missing values become 0,\n",
    "    # so their 'app' ends up as the string form of 0.\n",
    "    usage = pd.merge(users_chunk, packtime_all, on='uId', how='left').fillna(0)\n",
    "    usage['app'] = usage['app'].astype(str)\n",
    "\n",
    "    per_date = usage.groupby(['uId', 'start']).progress_apply(set_first_times).reset_index()\n",
    "    per_date.columns = ['uId', 'start', 'app']\n",
    "\n",
    "    per_user = per_date.groupby('uId').progress_apply(set_first_times).reset_index()\n",
    "    df_value.append(per_user)\n",
    "\n",
    "# Stack the five batches and write them out without index or header row.\n",
    "data = pd.concat(df_value, axis=0, sort=False)\n",
    "data.to_csv('data/set_series_1.csv', index=False, header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
